This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
library(sampling)
library(data.table)
library(jsonlite)
library(dplyr)
library(tidyr)
library(lubridate)
library(readr)
library(UsingR)
library(plotly)
library(gepaf)
library(leaflet)
TKeyAPIV3 <- "?api_key=29655b2934ca4235add916cb22e9b9f2"
TKeyAPIV2 <- "?api_key=DOb5b6UKM0uecKLrPskA7A"
#APIV3URLS
TRoutesURL3 <- "https://api-v3.mbta.com/routes"
TStopsURL3 <- "https://api-v3.mbta.com/stops"
TVehiclesURL3 <- "https://api-v3.mbta.com/vehicles"
TShapesURL3 <- "https://api-v3.mbta.com/shapes?route="
TSchedulesURL3 <- "https://api-v3.mbta.com/schedules?route="
TPredictionsURL3 <- "https://api-v3.mbta.com/predictions?route="
#APIV2 URLS
TRouteURL2 <- "http://realtime.mbta.com/developer/api/v2/stopsbyroute"
TTravelURL2 <- "http://realtime.mbta.com/developer/api/v2.1/traveltimes"
THeadwaysURL2 <- "http://realtime.mbta.com/developer/api/v2.1/headways"
TDwellsURL2 <- "http://realtime.mbta.com/developer/api/v2.1/dwells"
TFormat <- "&format=json"
This is the basic dataset directly downloaded from http://www.mbtabackontrack.com/performance/index.html#/download
RidData<-data.frame(read.csv("TDashboardData_ridership.csv"))
CSData<-data.frame(read.csv("TDashboardData_customersatisfaction.csv"))
FData<-data.frame(read.csv("TDashboardData_financials.csv"))
Glimpse of the CSV Datasets
| SERVICE_MONTH | MODE_TYPE | TOTAL_WEEKDAY_RIDERSHIP_COUNT | NUMBER_OF_WEEKDAYS | AVERAGE_WEEKDAY_RIDERSHIP_COUNT | ROUTE_OR_LINE |
|---|---|---|---|---|---|
| 2015-01-01 00:00:00 | RAIL | 1154894 | 20 | 57744.68 | BLUE LINE |
| 2015-01-01 00:00:00 | BUS | 7144221 | 20 | 357211.05 | BUS |
| 2015-01-01 00:00:00 | COMMUTER RAIL | 2436317 | 20 | 121815.87 | COMMUTER RAIL |
| 2015-01-01 00:00:00 | FERRY | 51665 | 20 | 2583.26 | F1-HINGHAM-BOSTON |
| 2015-01-01 00:00:00 | FERRY | 13848 | 20 | 692.41 | F2_F2H-Quincy-Boston-Logan-Hull-Georges |
| 2015-01-01 00:00:00 | FERRY | 6070 | 20 | 303.50 | F4-BOSTON-CHARLESTOWN |
| Survey.Date | Question.Group | Question.Text | Respondents | Average.Rating.Out.of.7 | Response.1.Text | Response.1.Percent | Response.2.Text | Response.2.Percent | Response.3.Text | Response.3.Percent | Response.4.Text | Response.4.Percent | Response.5.Text | Response.5.Percent | Response.6.Text | Response.6.Percent | Response.7.Text | Response.7.Percent |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2016-02-01 00:00:00 | Top Line | How would you rate the mbta overall? | 519 | 4.52 | Extremely Dissatisfied | 0.0303 | Very Dissatisfied | 0.0902 | Somewhat Dissatisfied | 0.1595 | Neutral | 0.0739 | Somewhat Satisfied | 0.3704 | Very Satisfied | 0.2418 | Extremely Satisfied | 0.0339 |
| 2016-02-01 00:00:00 | Top Line | How would you rate this trip overall? | 502 | 4.80 | Extremely Dissatisfied | 0.0496 | Very Dissatisfied | 0.0731 | Somewhat Dissatisfied | 0.0966 | Neutral | 0.1033 | Somewhat Satisfied | 0.2475 | Very Satisfied | 0.3412 | Extremely Satisfied | 0.0887 |
| 2016-02-01 00:00:00 | Top Line | How likely are you to continue using the mbta in the future? | 502 | 6.37 | Extremely Unlikely | 0.0035 | Very Unlikely | 0.0089 | Unlikely | 0.0026 | Neither Likely nor Unlikely | 0.0434 | Likely | 0.0684 | Very Likely | 0.2848 | Extremely Likely | 0.5884 |
| 2016-02-01 00:00:00 | Top Line | How likely are you to recommend the mbta to a friend or colleague? | 490 | 5.13 | Extremely Unlikely | 0.0589 | Very Unlikely | 0.0468 | Unlikely | 0.0717 | Neither Likely nor Unlikely | 0.1493 | Likely | 0.1779 | Very Likely | 0.1959 | Extremely Likely | 0.2995 |
| 2016-02-01 00:00:00 | Perceptions | The mbta provides reliable public transportation services. | 490 | 4.04 | Strongly Disagree | 0.1366 | Disagree | 0.1258 | Slightly Disagree | 0.1539 | Neither Agree nor Disagree | 0.0708 | Slightly Agree | 0.2228 | Agree | 0.2358 | Strongly Agree | 0.0543 |
| 2016-02-01 00:00:00 | Perceptions | The mbta is a cost-conscious organization. | 490 | 3.16 | Strongly Disagree | 0.2570 | Disagree | 0.1626 | Slightly Disagree | 0.1463 | Neither Agree nor Disagree | 0.1896 | Slightly Agree | 0.1121 | Agree | 0.1041 | Strongly Agree | 0.0282 |
| Reporting.Fiscal.Year | Reporting.Date | Category | Subcategory | Month.to.Date.Actual.Amount | Month.to.Date.Budgeted.Amount | Year.to.Date.Actual.Amount | Year.to.Date.Budgeted.Amount |
|---|---|---|---|---|---|---|---|
| 2015 | 2015-06-30 00:00:00 | Additional State Assistance | 8579385 | 24591674 | 125352620 | 295100000 | |
| 2015 | 2015-06-30 00:00:00 | Expenses | Debt Service | 24729468 | 32739023 | 413439712 | 423938425 |
| 2015 | 2015-06-30 00:00:00 | Expenses | Operating Expenses | 145087793 | 124309484 | 1508820862 | 1508921410 |
| 2015 | 2015-06-30 00:00:00 | Revenue | Non-Operating Revenues | 142500324 | 121956788 | 1157292600 | 1001817915 |
| 2015 | 2015-06-30 00:00:00 | Revenue | Operating Revenues | 57387316 | 56642014 | 645968041 | 646174787 |
| 2015 | 2015-06-30 00:00:00 | Transfers In | 0 | 0 | 0 | 0 |
MBTARidership <- plot_ly(x=format(as.Date(unique(RidData$SERVICE_MONTH)), "%y-%m"),
y = aggregate(TOTAL_WEEKDAY_RIDERSHIP_COUNT ~ SERVICE_MONTH, RidData, sum)$TOTAL_WEEKDAY_RIDERSHIP_COUNT,name = "Ridership", type = "bar") %>%
add_lines(x=format(as.Date(unique(CSData$Survey.Date)), "%y-%m"),y=aggregate(Average.Rating.Out.of.7 ~ Survey.Date, CSData, mean)$Average.Rating.Out.of.7, name = "Customer Satisfaction", yaxis = "y2") %>%
layout(title = "MBTA Ridership and Satisfaction", yaxis = list(title = "Frequency"), yaxis2 = list(overlaying = "y", side = "right", title = "Satisfaction (on a scale 7)"))
The Ridership stays high during March to October and usually decreases during the winter (December - April)
MBTAExpenseRev <- plot_ly(x=format(as.Date(unique(FData$Reporting.Date)), "%y-%m"),
y = aggregate(Month.to.Date.Actual.Amount ~ Reporting.Date, subset(FData,Category=="Expenses"), sum)$Month.to.Date.Actual.Amount,name = "Expense", type = "bar") %>%
add_trace(x=format(as.Date(unique(FData$Reporting.Date)), "%y-%m"), y = aggregate(Month.to.Date.Actual.Amount ~ Reporting.Date, subset(FData,Category=="Revenue"), sum)$Month.to.Date.Actual.Amount, name = "Revenue") %>%
layout(title = "MBTA Monthly Expense and Revenue", yaxis = list(title = "Cost"))
The gap between the expense and the revenue has been closing down in 2017 unlike 2 years back.
RoutesComplete<-fromJSON(paste(TRoutesURL3))$data
#[1:8] pickout only the subway list
RoutesList<-unique(subset(RoutesComplete, RoutesComplete$attributes$description == "Rapid Transit")$id)[1:8]
Glimpse of the Routes data (RoutesComplete)
## type self id attributes.type attributes.sort_order
## 1 route /routes/Red Red 1 1
## 2 route /routes/Orange Orange 1 2
## 3 route /routes/Green-B Green-B 0 3
## 4 route /routes/Green-C Green-C 0 4
## 5 route /routes/Green-D Green-D 0 5
## 6 route /routes/Green-E Green-E 0 6
## attributes.short_name attributes.long_name attributes.direction_names
## 1 Red Line Southbound, Northbound
## 2 Orange Line Southbound, Northbound
## 3 B Green Line B Westbound, Eastbound
## 4 C Green Line C Westbound, Eastbound
## 5 D Green Line D Westbound, Eastbound
## 6 E Green Line E Westbound, Eastbound
## attributes.description
## 1 Rapid Transit
## 2 Rapid Transit
## 3 Rapid Transit
## 4 Rapid Transit
## 5 Rapid Transit
## 6 Rapid Transit
RidershipByMode<-aggregate(TOTAL_WEEKDAY_RIDERSHIP_COUNT ~ MODE_TYPE,RidData,sum)
RidershipByModePie <- plot_ly(RidershipByMode, labels = ~MODE_TYPE, values = ~TOTAL_WEEKDAY_RIDERSHIP_COUNT, type = 'pie',textposition = 'inside', textinfo = 'label+percent') %>%
layout(title = 'Total Ridership by Mode of Transport')
RidershipByRoute<-aggregate(TOTAL_WEEKDAY_RIDERSHIP_COUNT ~ ROUTE_OR_LINE ,RidData,sum)
RidershipByRouteBar <- plot_ly(RidershipByRoute, x = ~ROUTE_OR_LINE, y = ~TOTAL_WEEKDAY_RIDERSHIP_COUNT, color = ~ROUTE_OR_LINE, type = 'bar') %>%
layout(title = 'Total Weekday Ridership in a month', yaxis=list(title='Ridership'), xaxis=list(title=''))
NumOfRoutesByType <- plot_ly(
x = names(table(RoutesComplete$attributes$description)),
y = as.numeric(table(RoutesComplete$attributes$description)),
type = "bar", color=names(table(RoutesComplete$attributes$description))) %>%
layout(title = 'Number of Routes in MBTA', yaxis = list(title = "Frequency"))
if(file.exists("SchedulesComplete.rds")) {
SchedulesComplete <- readRDS("SchedulesComplete.rds")
} else {
SchedulesComplete <- data.frame(Date=as.Date(character()),
File=character(),
User=character(),
stringsAsFactors=FALSE)
for(i in RoutesList) {
SchedulesComplete<-rbind(SchedulesComplete, data.frame(fromJSON(paste(TSchedulesURL3,i,sep=""), flatten = TRUE)))
}
saveRDS(SchedulesComplete, "SchedulesComplete.rds")
}
## version data.type data.id
## 1 1.0 schedule schedule-34708374-70061-1
## 2 1.0 schedule schedule-34708374-70063-10
## 3 1.0 schedule schedule-34708374-70065-20
## 4 1.0 schedule schedule-34708374-70067-30
## 5 1.0 schedule schedule-34708374-70069-40
## 6 1.0 schedule schedule-34708374-70071-50
## data.relationships.trip.data.type data.relationships.trip.data.id
## 1 trip 34708374
## 2 trip 34708374
## 3 trip 34708374
## 4 trip 34708374
## 5 trip 34708374
## 6 trip 34708374
## data.relationships.stop.data.type data.relationships.stop.data.id
## 1 stop 70061
## 2 stop 70063
## 3 stop 70065
## 4 stop 70067
## 5 stop 70069
## 6 stop 70071
## data.relationships.route.data.type data.relationships.route.data.id
## 1 route Red
## 2 route Red
## 3 route Red
## 4 route Red
## 5 route Red
## 6 route Red
## data.attributes.timepoint data.attributes.stop_sequence
## 1 FALSE 1
## 2 FALSE 10
## 3 FALSE 20
## 4 FALSE 30
## 5 FALSE 40
## 6 FALSE 50
## data.attributes.pickup_type data.attributes.drop_off_type
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## data.attributes.departure_time data.attributes.arrival_time
## 1 2017-12-09T07:37:00-05:00 2017-12-09T07:37:00-05:00
## 2 2017-12-09T07:39:00-05:00 2017-12-09T07:39:00-05:00
## 3 2017-12-09T07:41:00-05:00 2017-12-09T07:41:00-05:00
## 4 2017-12-09T07:44:00-05:00 2017-12-09T07:44:00-05:00
## 5 2017-12-09T07:47:00-05:00 2017-12-09T07:47:00-05:00
## 6 2017-12-09T07:50:00-05:00 2017-12-09T07:50:00-05:00
NumOfTripsByType <- plot_ly(
x = names(table(SchedulesComplete$data.relationships.route.data.id)),
y = as.numeric(table(SchedulesComplete$data.relationships.route.data.id)),
type = "bar",
color = names(table(SchedulesComplete$data.relationships.route.data.id))) %>%
layout(title = "Number of Scheduled Trips in a day (Saturday)", yaxis = list(title = "Frequency"))
Code in RMarkdown source
startTime <- as.POSIXct("2017-12-01 04:00:00")
endTime <- as.POSIXct("2017-12-08 04:00:00")
numWeeks <- as.integer(unclass(endTime - startTime)/7)
fromTime <- paste("&from_datetime=",as.numeric(startTime), sep="")
toTime <- paste("&to_datetime=",as.numeric(endTime), sep="")
Code in RMarkdown Source
GreenLineBTravelTimes provides the travel times from each station to the nearest two stations within the same line. For Sampling I took 100 samples from the CombinedAllstonBUEast data which generates the travel times from Allston St to BU East from the processed. The processing step looping through each station is hidden within the RMarkdown.
set.seed(123)
#Glimpse of GreenLineBTravelTimes
head(GreenLineBTravelTimes)
## from_stop from_name from_lat from_lon to_stop to_name
## 1 70208 Science Park 42.366664 -71.067666 70206 North Station
## 2 70206 North Station 42.365577 -71.06129 70204 Haymarket
## 3 70204 Haymarket 42.363021 -71.05829 70202 Government Center
## 4 70208 Science Park 42.366664 -71.067666 70206 North Station
## 5 70159 Boylston 42.35302 -71.06459 70157 Arlington
## 6 70206 North Station 42.365577 -71.06129 70204 Haymarket
## to_lat to_lon direction dep_dt arr_dt
## 1 42.365577 -71.06129 0 2017-12-01 05:04:27 2017-12-01 05:06:54
## 2 42.363021 -71.05829 0 2017-12-01 05:07:38 2017-12-01 05:08:33
## 3 42.359705 -71.059215 0 2017-12-01 05:09:30 2017-12-01 05:10:36
## 4 42.365577 -71.06129 0 2017-12-01 05:12:32 2017-12-01 05:15:27
## 5 42.351902 -71.070893 0 2017-12-01 05:15:44 2017-12-01 05:17:33
## 6 42.363021 -71.05829 0 2017-12-01 05:16:00 2017-12-01 05:17:00
## travel_time_sec benchmark_travel_time_sec dep_d dep_t arr_d
## 1 147 180 2017-12-01 05:04:27 2017-12-01
## 2 55 120 2017-12-01 05:07:38 2017-12-01
## 3 66 120 2017-12-01 05:09:30 2017-12-01
## 4 175 180 2017-12-01 05:12:32 2017-12-01
## 5 109 120 2017-12-01 05:15:44 2017-12-01
## 6 60 120 2017-12-01 05:16:00 2017-12-01
## arr_t
## 1 05:06:54
## 2 05:08:33
## 3 05:10:36
## 4 05:15:27
## 5 05:17:33
## 6 05:17:00
nrow(GreenLineBTravelTimes)
## [1] 82681
SamplingPlot <- plot_ly(x=GreenLineBTravelTimes$travel_time_sec, type="histogram", name="All GL stop pairs") %>%
layout(title="Green Line B travel time distribution", xaxis=list(range=c(0,200) , yaxis=list(title="Frequency")))
AllstonToBUEast<-fromJSON(paste(TTravelURL2, TKeyAPIV2, TFormat, "&from_stop=", 70126, "&to_stop=", 70146, fromTime, toTime, sep=""))[[1]][c(1:5)]
BUEastToAllston<-fromJSON(paste(TTravelURL2, TKeyAPIV2, TFormat, "&from_stop=", 70146, "&to_stop=", 70126, fromTime, toTime, sep=""))[[1]][c(1:5)]
CombinedAllstonBUEast<-rbind(AllstonToBUEast,BUEastToAllston)
head(CombinedAllstonBUEast)
## route_id direction dep_dt arr_dt travel_time_sec
## 1 Green-B 1 1512123340 1512124110 770
## 2 Green-B 1 1512123898 1512124613 715
## 3 Green-B 1 1512124507 1512125403 896
## 4 Green-B 1 1512125315 1512126156 841
## 5 Green-B 1 1512125940 1512126673 733
## 6 Green-B 1 1512126526 1512127434 908
CombinedAllstonBUEast$travel_time_sec<-as.numeric(CombinedAllstonBUEast$travel_time_sec)
SamplingPlot1 <- plot_ly(x=as.numeric(CombinedAllstonBUEast$travel_time_sec), type="histogram", name="Allston -> BU East") %>%
layout(title="Travel time distribution", xaxis=list(range=c(300,1700)), yaxis=list(title="Frequency"))
#Testing Central Limit Theorem
GreenLineBSubset<-aggregate(travel_time_sec ~ ., GreenLineBTravelTimes[c(2,6,9,12)], mean)
GreenLineBSubset$travel_time_sec<-as.numeric(GreenLineBSubset$travel_time_sec)
head(GreenLineBSubset)
## from_name to_name direction travel_time_sec
## 1 Griggs Street Allston Street 0 19.95825
## 2 Boylston Arlington 0 94.59102
## 3 Pleasant Street Babcock Street 0 42.86942
## 4 Kenmore Blandford Street 0 39.78655
## 5 Boston Univ. East Boston Univ. Central 0 41.92593
## 6 Blandford Street Boston Univ. East 0 65.59015
fit.subset <- density(GreenLineBSubset$travel_time_sec)
SamplingPlot2 <- plot_ly(x = GreenLineBSubset$travel_time_sec, type = "histogram", name="Histogram") %>%
add_trace(x = fit.subset$x, y = fit.subset$y, type = "scatter", mode = "lines+markers", yaxis = "y2", name = "Density") %>%
layout(title="Green Line B travel time distribution (Aggregate)", yaxis=list(title="Frequency"), yaxis2 = list(overlaying = "y", side = "right"))
fit.allston.bu <- density(CombinedAllstonBUEast$travel_time_sec)
SamplingPlot2.1 <- plot_ly(x = CombinedAllstonBUEast$travel_time_sec, type = "histogram", name="Histogram") %>%
add_trace(x = fit.allston.bu$x, y = fit.allston.bu$y, type = "scatter", mode = "lines+markers", yaxis = "y2", name = "Density") %>%
layout(title="Allston St -> BU East time distribution", xaxis=list(range=c(300,1700)), yaxis=list(title="Frequency"), yaxis2 = list(overlaying = "y", side = "right"))
#Sampling with 10000 samples with sample size 5
samples.5<-10000
sample.size.5<-5
xbar.5<-numeric(samples.5)
meanGL.5<-mean(CombinedAllstonBUEast$travel_time_sec)
sdGL.5<-sd(CombinedAllstonBUEast$travel_time_sec)
for (i in 1: samples.5) {
xbar.5[i] <- mean(rnorm(sample.size.5,
meanGL.5, sdGL.5))
}
SamplingPlot3 <- plot_ly(x = xbar.5, type = "histogram", histnorm = "probability", name="Size = 5") %>%
layout(title="Allston St -> BU East travel time distribution - 10000 samples", xaxis=list(range=c(400,1600)))
#Sampling with 10000 samples with sample size 10
samples.10<-10000
sample.size.10<-10
xbar.10<-numeric(samples.10)
meanGL.10<-mean(CombinedAllstonBUEast$travel_time_sec)
sdGL.10<-sd(CombinedAllstonBUEast$travel_time_sec)
for (i in 1: samples.10) {
xbar.10[i] <- mean(rnorm(sample.size.10,
meanGL.10, sdGL.10))
}
SamplingPlot4 <- plot_ly(x = xbar.10, type = "histogram", histnorm = "probability",name="Size = 10") %>%
layout(title="Allston St -> BU East travel time distribution - 10000 samples", xaxis=list(range=c(400,1600)))
#Sampling with 10000 samples with sample size 15
samples.15<-10000
sample.size.15<-15
xbar.15<-numeric(samples.15)
meanGL.15<-mean(CombinedAllstonBUEast$travel_time_sec)
sdGL.15<-sd(CombinedAllstonBUEast$travel_time_sec)
for (i in 1: samples.15) {
xbar.15[i] <- mean(rnorm(sample.size.15,
meanGL.15, sdGL.15))
}
SamplingPlot5 <- plot_ly(x = xbar.15, type = "histogram", histnorm = "probability",name="Size = 15") %>%
layout(title="Allston St -> BU East travel time distribution - 10000 samples", xaxis=list(range=c(400,1600)))
#Sampling with 10000 samples with sample size 20
samples.20<-10000
sample.size.20<-20
xbar.20<-numeric(samples.20)
meanGL.20<-mean(CombinedAllstonBUEast$travel_time_sec)
sdGL.20<-sd(CombinedAllstonBUEast$travel_time_sec)
for (i in 1: samples.20) {
xbar.20[i] <- mean(rnorm(sample.size.20,
meanGL.20, sdGL.20))
}
SamplingPlot6 <- plot_ly(x = xbar.20, type = "histogram", histnorm = "probability" ,name="Size = 20") %>%
layout(title="Allston St -> BU East travel time distribution - 10000 samples", xaxis=list(range=c(400,1600)))
s <- srswr(100, nrow(CombinedAllstonBUEast))
rows <- (1:nrow(CombinedAllstonBUEast))[s!=0]
rows <- rep(rows, s[s != 0])
sample.1 <- CombinedAllstonBUEast[rows, ]
fit.sample.1<-density(sample.1$travel_time_sec)
SamplingPlot7 <- plot_ly(x=sample.1$travel_time_sec, type="histogram", name="SRSWR") %>%
add_trace(x = fit.sample.1$x, y = fit.sample.1$y, type = "scatter", fill = "tozeroy", mode = "lines", yaxis = "y2", name = "Density") %>%
layout(title="Simple Random Sampling : Allston St -> BU East travel Time distribution", xaxis=list(range=c(300,1700)), yaxis2 = list(overlaying = "y", side = "right"))
s <- srswor(100,nrow(CombinedAllstonBUEast))
sample.2 <- CombinedAllstonBUEast[s != 0, ]
fit.sample.2=density(sample.2$travel_time_sec)
SamplingPlot8 <- plot_ly(x=sample.2$travel_time_sec, type="histogram", name="SRSWOR") %>%
add_trace(x = fit.sample.2$x, y = fit.sample.2$y, type = "scatter", fill = "tozeroy", mode = "lines", yaxis = "y2", name = "Density") %>%
layout(title="Simple Random Sampling : Allston St -> BU East travel Time distribution", xaxis=list(range=c(300,1700)), yaxis2 = list(overlaying = "y", side = "right"))
#Systematic
N <- nrow(CombinedAllstonBUEast)
n <- 100
k <- ceiling(N / n)
r <- sample(k, 1)
# select every kth item
s <- seq(r, by = k, length = n)
sample.3 <- CombinedAllstonBUEast[s, ]
sample.3<-na.omit(sample.3)
fit.sample.3<-density(sample.3$travel_time_sec)
SamplingPlot9 <- plot_ly(x=sample.3$travel_time_sec, type="histogram", name="Systematic") %>%
add_trace(x = fit.sample.3$x, y = fit.sample.3$y, type = "scatter", fill = "tozeroy", mode = "lines", yaxis = "y2", name = "Density") %>%
layout(title="Systematic Sampling : Allston St -> BU East travel Time distribution", xaxis=list(range=c(300,1700)), yaxis2 = list(overlaying = "y", side = "right"))
fit.org.data = density(CombinedAllstonBUEast$travel_time_sec)
SamplingPlot10 <- plot_ly(x=as.numeric(CombinedAllstonBUEast$travel_time_sec), type = "histogram", name = "Histogram") %>%
add_trace(x = fit.org.data$x, y = fit.org.data$y, type = "scatter", mode = "lines", yaxis = "y2", name = "Original") %>%
add_trace(x = fit.sample.1$x, y = fit.sample.1$y, type = "scatter", mode = "lines", yaxis = "y2", name = "SRSWR") %>%
add_trace(x = fit.sample.2$x, y = fit.sample.2$y, type = "scatter", mode = "lines", yaxis = "y2", name = "SRSWOR") %>%
add_trace(x = fit.sample.3$x, y = fit.sample.3$y, type = "scatter", mode = "lines", yaxis = "y2", name = "Systematic")%>%
layout(title="Sampling methods combined - Allston St -> BU East travel time", xaxis=list(range=c(300,1700)), yaxis2 = list(overlaying = "y", side = "right"))
conf<-c(80,90)
alpha<- 1-conf/100
sample.size.10<-10
sd.sample.means.10<-sd(as.numeric(CombinedAllstonBUEast$travel_time_sec))/sqrt(sample.size.10)
sample.data.10<-sample(as.numeric(CombinedAllstonBUEast$travel_time_sec), size=sample.size.10)
xbar.10<-mean(sample.data.10)
confdf.10 <- data.frame(Date=as.Date(character()),
File=character(),
User=character(),
stringsAsFactors=FALSE)
for(i in alpha) {
cf.10<-c(100*(1-i),xbar.10-qnorm(1-i/2)*sd.sample.means.10,xbar.10+qnorm(1-i/2)*sd.sample.means.10)
confdf.10<-rbind(confdf.10,cf.10)
str<-sprintf("%2d%% Conf Level (alpha=%.2f), CI=%.2f-%.2f",100*(1-i),i,xbar.10-qnorm(1-i/2)*sd.sample.means.10,xbar.10+qnorm(1-i/2)*sd.sample.means.10)
cat(str,"\n")
}
## 80% Conf Level (alpha=0.20), CI=843.13-976.07
## 90% Conf Level (alpha=0.10), CI=824.28-994.92
colnames(confdf.10)<-c("conf","ll","ul")
fit.sample.conf.10<-density(sample.data.10)
ConfidencePlot1 <- plot_ly(x=sample.data.10, type="histogram", name="Sampled Data") %>%
add_trace(x = fit.sample.conf.10$x, y = fit.sample.conf.10$y, type = "scatter", fill = "tozeroy", mode = "lines", yaxis = "y2", name = "Density") %>%
add_trace(x = rep(mean(CombinedAllstonBUEast$travel_time_sec),2), y = c(0,25) , type = "scatter", color = "black", mode = "lines", name = "Mean") %>%
add_trace(x = c(confdf.10$ll[1],confdf.10$ll[1],confdf.10$ul[1],confdf.10$ul[1]), y = c(0,25,25,0) , type = "scatter", fill = "tozeroy", color = "red", mode = "lines", name = "80% conf") %>%
add_trace(x = c(confdf.10$ll[2],confdf.10$ll[2],confdf.10$ul[2],confdf.10$ul[2]), y = c(0,25,25,0) , type = "scatter", fill = "tozeroy", color = "blue", mode = "lines", name = "90% conf") %>%
add_trace(x = rep(mean(CombinedAllstonBUEast$travel_time_sec),2), y = c(0,25) , type = "scatter", color = "red", mode = "lines", name = "Mean") %>%
layout(title="Confidence Interval - Sample Size 10 : Allston St -> BU East travel Time distribution", xaxis=list(range=c(300,1700)), yaxis2 = list(overlaying = "y", side = "right"))
#For Sample size 50
sample.size.50<-50
sd.sample.means.50<-sd(as.numeric(CombinedAllstonBUEast$travel_time_sec))/sqrt(sample.size.50)
sample.data.50<-sample(as.numeric(CombinedAllstonBUEast$travel_time_sec), size=sample.size.50)
xbar.50<-mean(sample.data.50)
confdf.50 <- data.frame(Date=as.Date(character()),
File=character(),
User=character(),
stringsAsFactors=FALSE)
for(i in alpha) {
cf.50<-c(100*(1-i),xbar.50-qnorm(1-i/2)*sd.sample.means.50,xbar.50+qnorm(1-i/2)*sd.sample.means.50)
confdf.50<-rbind(confdf.50,cf.50)
str<-sprintf("%2d%% Conf Level (alpha=%.2f), CI=%.2f-%.2f",100*(1-i),i,xbar.50-qnorm(1-i/2)*sd.sample.means.50,xbar.50+qnorm(1-i/2)*sd.sample.means.50)
cat(str,"\n")
}
## 80% Conf Level (alpha=0.20), CI=834.69-894.15
## 90% Conf Level (alpha=0.10), CI=826.26-902.58
colnames(confdf.50)<-c("conf","ll","ul")
fit.sample.conf.50<-density(sample.data.50)
ConfidencePlot2 <- plot_ly(x=sample.data.50, type="histogram", name="Sampled Data") %>%
add_trace(x = fit.sample.conf.50$x, y = fit.sample.conf.50$y, type = "scatter", fill = "tozeroy", mode = "lines", yaxis = "y2", name = "Density") %>%
add_trace(x = rep(mean(CombinedAllstonBUEast$travel_time_sec),2), y = c(0,25) , type = "scatter", color = "black", mode = "lines", name = "Mean") %>%
add_trace(x = c(confdf.50$ll[1],confdf.50$ll[1],confdf.50$ul[1],confdf.50$ul[1]), y = c(0,25,25,0) , type = "scatter", fill = "tozeroy", color = "red", mode = "lines", name = "80% conf") %>%
add_trace(x = c(confdf.50$ll[2],confdf.50$ll[2],confdf.50$ul[2],confdf.50$ul[2]), y = c(0,25,25,0) , type = "scatter", fill = "tozeroy", color = "blue", mode = "lines", name = "90% conf") %>%
add_trace(x = rep(mean(CombinedAllstonBUEast$travel_time_sec),2), y = c(0,25) , type = "scatter", color = "red", mode = "lines", name = "Mean") %>%
layout(title="Confidence Interval - Sample Size 50 : Allston St -> BU East travel Time distribution", xaxis=list(range=c(300,1700)), yaxis2 = list(overlaying = "y", side = "right"))
#For Sample size 75
sample.size.75<-75
sd.sample.means.75<-sd(as.numeric(CombinedAllstonBUEast$travel_time_sec))/sqrt(sample.size.75)
sample.data.75<-sample(as.numeric(CombinedAllstonBUEast$travel_time_sec), size=sample.size.75)
xbar.75<-mean(sample.data.75)
confdf.75 <- data.frame(Date=as.Date(character()),
File=character(),
User=character(),
stringsAsFactors=FALSE)
for(i in alpha) {
cf.75<-c(100*(1-i),xbar.75-qnorm(1-i/2)*sd.sample.means.75,xbar.75+qnorm(1-i/2)*sd.sample.means.75)
confdf.75<-rbind(confdf.75,cf.75)
str<-sprintf("%2d%% Conf Level (alpha=%.2f), CI=%.2f-%.2f",100*(1-i),i,xbar.75-qnorm(1-i/2)*sd.sample.means.75,xbar.75+qnorm(1-i/2)*sd.sample.means.75)
cat(str,"\n")
}
## 80% Conf Level (alpha=0.20), CI=877.51-926.06
## 90% Conf Level (alpha=0.10), CI=870.63-932.94
colnames(confdf.75)<-c("conf","ll","ul")
fit.sample.conf.75<-density(sample.data.75)
ConfidencePlot3 <- plot_ly(x=sample.data.75, type="histogram", name="Sampled Data") %>%
add_trace(x = fit.sample.conf.75$x, y = fit.sample.conf.75$y, type = "scatter", fill = "tozeroy", mode = "lines", yaxis = "y2", name = "Density") %>%
add_trace(x = rep(mean(CombinedAllstonBUEast$travel_time_sec),2), y = c(0,25) , type = "scatter", color = "black", mode = "lines", name = "Mean") %>%
add_trace(x = c(confdf.75$ll[1],confdf.75$ll[1],confdf.75$ul[1],confdf.75$ul[1]), y = c(0,25,25,0) , type = "scatter", fill = "tozeroy", color = "red", mode = "lines", name = "80% conf") %>%
add_trace(x = c(confdf.75$ll[2],confdf.75$ll[2],confdf.75$ul[2],confdf.75$ul[2]), y = c(0,25,25,0) , type = "scatter", fill = "tozeroy", color = "blue", mode = "lines", name = "90% conf") %>%
add_trace(x = rep(mean(CombinedAllstonBUEast$travel_time_sec),2), y = c(0,25) , type = "scatter", color = "red", mode = "lines", name = "Mean") %>%
layout(title="Confidence Interval - Sample Size 75 : Allston St -> BU East travel Time distribution", xaxis=list(range=c(300,1700)), yaxis2 = list(overlaying = "y", side = "right"))
Increasing the sample size decreases the width of confidence intervals, because it decreases the standard error.
head(RedLineTravelTimes)
## from_stop from_name from_lat from_lon to_stop to_name to_lat
## 1 70063 Davis 42.39674 -71.121815 70065 Porter 42.3884
## 2 70065 Porter 42.3884 -71.119149 70067 Harvard 42.373362
## 3 70067 Harvard 42.373362 -71.118956 70069 Central 42.365486
## 4 70063 Davis 42.39674 -71.121815 70065 Porter 42.3884
## 5 70069 Central 42.365486 -71.103802 70071 Kendall/MIT 42.36249079
## 6 70065 Porter 42.3884 -71.119149 70067 Harvard 42.373362
## to_lon direction dep_dt arr_dt
## 1 -71.119149 0 2017-12-01 05:21:13 2017-12-01 05:22:28
## 2 -71.118956 0 2017-12-01 05:23:22 2017-12-01 05:25:11
## 3 -71.103802 0 2017-12-01 05:26:10 2017-12-01 05:29:00
## 4 -71.119149 0 2017-12-01 05:28:58 2017-12-01 05:30:08
## 5 -71.08617653 0 2017-12-01 05:29:58 2017-12-01 05:31:37
## 6 -71.118956 0 2017-12-01 05:30:59 2017-12-01 05:32:44
## travel_time_sec benchmark_travel_time_sec dep_d dep_t arr_d
## 1 75 120 2017-12-01 05:21:13 2017-12-01
## 2 109 180 2017-12-01 05:23:22 2017-12-01
## 3 170 180 2017-12-01 05:26:10 2017-12-01
## 4 70 120 2017-12-01 05:28:58 2017-12-01
## 5 99 180 2017-12-01 05:29:58 2017-12-01
## 6 105 180 2017-12-01 05:30:59 2017-12-01
## arr_t
## 1 05:22:28
## 2 05:25:11
## 3 05:29:00
## 4 05:30:08
## 5 05:31:37
## 6 05:32:44
RedLineTrvTimeBoxPlot <- plot_ly(y=as.numeric(RedLineTravelTimes$travel_time_sec), color = RedLineTravelTimes$from_name, type = "box") %>%
layout(title="Box plot of Red Line Travel Times", yaxis=list(title="Time",range=c(0,1000)))
GreenBLineTrvTimeBoxPlot <- plot_ly(y=as.numeric(GreenLineBTravelTimes$travel_time_sec), color = GreenLineBTravelTimes$from_name, type = "box") %>%
layout(title="Box plot of Green Line B Branch Travel Times", yaxis=list(title="Time",range=c(0,1000)))
A Density Plot visualises the distribution of data over a continuous interval or time period.
RLdensity <- with(RedLineTravelTimes %>% filter(direction == 1), tapply(travel_time_sec, INDEX = from_name, density))
RLdf <- data.frame(
x = unlist(lapply(RLdensity, "[[", "x")),
y = unlist(lapply(RLdensity, "[[", "y")),
from_name = rep(names(RLdensity), each = length(RLdensity[[1]]$x))
)
RedLineDensityPlotN <- plot_ly(RLdf, x = ~x, y = ~y, color = ~from_name, fill = 'tozeroy') %>%
add_lines() %>%
layout(title="Density Plot for Travel times for MBTA RL trains by departing station", yaxis=list(title="Density",range=c(0,0.4)), xaxis=list(title="NorthBound",range = c(0,600)))
RLdensity <- with(RedLineTravelTimes %>% filter(direction == 0), tapply(travel_time_sec, INDEX = from_name, density))
RLdf <- data.frame(
x = unlist(lapply(RLdensity, "[[", "x")),
y = unlist(lapply(RLdensity, "[[", "y")),
from_name = rep(names(RLdensity), each = length(RLdensity[[1]]$x))
)
RedLineDensityPlotS <- plot_ly(RLdf, x = ~x, y = ~y, color = ~from_name, fill = 'tozeroy') %>%
add_lines() %>%
layout(yaxis=list(title="Density",range=c(0,0.4)), xaxis=list(title="SouthBound",range = c(0,600)))
A Density Plot visualises the distribution of data over a continuous interval or time period.
Time spent by the train at a station PS:Green Line B data was inconsistent
RedLineDwellTimes Data
head(RedLineDwellTimes)
## route_id direction arr_dt dep_dt dwell_time_sec stop_id
## 1 Red 0 1512124096 1512124647 551 70061
## 2 Red 0 1512124553 1512125223 670 70061
## 3 Red 0 1512125048 1512125551 503 70061
## 4 Red 0 1512125788 1512126056 268 70061
## 5 Red 0 1512126522 1512126780 258 70061
## 6 Red 0 1512126708 1512126973 265 70061
## stop_name hour_time
## 1 Alewife 05
## 2 Alewife 05
## 3 Alewife 05
## 4 Alewife 05
## 5 Alewife 06
## 6 Alewife 06
df<-RedLineDwellTimes[,c(5,7,8)]
df$dwell_time_sec<-as.double(df$dwell_time_sec)
df1<-aggregate(dwell_time_sec ~ ., df, mean)
df2<-aggregate(dwell_time_sec ~ hour_time, df1, mean)
df3<-GreenLineBDwellTimes[,c(5,7,8)]
df3$dwell_time_sec<-as.double(df3$dwell_time_sec)
df4<-aggregate(dwell_time_sec ~ ., df3, mean)
df5<-aggregate(dwell_time_sec ~ hour_time, df4, mean)
df6<-OrangeLineDwellTimes[,c(5,7,8)]
df6$dwell_time_sec<-as.double(df6$dwell_time_sec)
df7<-aggregate(dwell_time_sec ~ ., df6, mean)
df8<-aggregate(dwell_time_sec ~ hour_time, df7, mean)[-c(2),]
df2new1<-cbind(df2,rep("Red",nrow(df2)))
colnames(df2new1)<-c("hour_time","dwell_time_sec","route_name")
df5new1<-cbind(df5,rep("Green-B",nrow(df5)))
colnames(df5new1)<-c("hour_time","dwell_time_sec","route_name")
df8new1<-cbind(df8,rep("Orange",nrow(df8)))
colnames(df8new1)<-c("hour_time","dwell_time_sec","route_name")
dfF<-rbind(df2new1,df5new1,df8new1)
dfF$dwell_time_sec<-as.numeric(dfF$dwell_time_sec)
RGODwellPlot<-plot_ly(dfF, x = ~hour_time, y = ~dwell_time_sec, color = ~route_name, type="bar", name="Dwell") %>%
layout(title="Dwell Times of trains during hours in a day",yaxis=list(title="Dwell time"),xaxis=list(title=""))
Headways track the time between departures at a given station. When trains are running late, headways exceed their benchmark targets. The root causes of slow service can probably be better picked apart from travel times and dwell times, but the eventual impact to riders is most cleanly seen in headways.
RedLineHeadway<-na.omit(RedLineHeadway)
head(RedLineHeadway)
## from_stop from_name from_lat from_lon to_stop to_name to_lat
## 1 70063 Davis 42.39674 -71.121815 70065 Porter 42.3884
## 2 70065 Porter 42.3884 -71.119149 70067 Harvard 42.373362
## 3 70063 Davis 42.39674 -71.121815 70065 Porter 42.3884
## 4 70067 Harvard 42.373362 -71.118956 70069 Central 42.365486
## 5 70065 Porter 42.3884 -71.119149 70067 Harvard 42.373362
## 6 70069 Central 42.365486 -71.103802 70071 Kendall/MIT 42.36249079
## to_lon prev_route_id direction current_dep_dt
## 1 -71.119149 Red 0 2017-12-01 05:28:58
## 2 -71.118956 Red 0 2017-12-01 05:30:59
## 3 -71.119149 Red 0 2017-12-01 05:33:45
## 4 -71.103802 Red 0 2017-12-01 05:34:49
## 5 -71.118956 Red 0 2017-12-01 05:35:49
## 6 -71.08617653 Red 0 2017-12-01 05:38:22
## previous_dep_dt headway_time_sec benchmark_headway_time_sec
## 1 2017-12-01 05:21:13 465 480
## 2 2017-12-01 05:23:22 457 405
## 3 2017-12-01 05:28:58 287 405
## 4 2017-12-01 05:26:10 519 420
## 5 2017-12-01 05:30:59 290 405
## 6 2017-12-01 05:29:58 504 405
## current_dep_d current_dep_t previous_dep_d previous_dep_t
## 1 2017-12-01 05:28:58 2017-12-01 05:21:13
## 2 2017-12-01 05:30:59 2017-12-01 05:23:22
## 3 2017-12-01 05:33:45 2017-12-01 05:28:58
## 4 2017-12-01 05:34:49 2017-12-01 05:26:10
## 5 2017-12-01 05:35:49 2017-12-01 05:30:59
## 6 2017-12-01 05:38:22 2017-12-01 05:29:58
RedLineHeadwayBoxPlot <- plot_ly(RedLineHeadway, y = ~(headway_time_sec), color = ~from_name, type = "box")%>%
layout(title="Headway Red Line Branch", yaxis=list(title="Headway time",range=c(0,2000)))
StopsComplete<-data.frame(fromJSON(paste(TStopsURL3))$data)
MBTAStaticMap <- leaflet() %>%
setView(lng = -71.0589, lat = 42.3601, zoom = 12) %>%
addProviderTiles("CartoDB.Positron") %>%
addPolylines(as.numeric(RedLineRoute1$stop_lon),as.numeric(RedLineRoute1$stop_lat),color="red") %>%
addCircleMarkers(as.numeric(RedLineRoute1$stop_lon),as.numeric(RedLineRoute1$stop_lat),radius=1,popup = RedLineRoute1$parent_station_name) %>%
addPolylines(as.numeric(GreenBLineRoute1$stop_lon),as.numeric(GreenBLineRoute1$stop_lat),color="green") %>%
addCircleMarkers(as.numeric(GreenBLineRoute1$stop_lon),as.numeric(GreenBLineRoute1$stop_lat),radius=1,popup = GreenBLineRoute1$parent_station_name) %>%
addPolylines(as.numeric(GreenCLineRoute1$stop_lon),as.numeric(GreenCLineRoute1$stop_lat),color="green") %>%
addCircleMarkers(as.numeric(GreenCLineRoute1$stop_lon),as.numeric(GreenCLineRoute1$stop_lat),radius=1,popup = GreenCLineRoute1$parent_station_name) %>%
addPolylines(as.numeric(GreenDLineRoute1$stop_lon),as.numeric(GreenDLineRoute1$stop_lat),color="green") %>%
addCircleMarkers(as.numeric(GreenDLineRoute1$stop_lon),as.numeric(GreenDLineRoute1$stop_lat),radius=1,popup = GreenDLineRoute1$parent_station_name) %>%
addPolylines(as.numeric(GreenELineRoute1$stop_lon),as.numeric(GreenELineRoute1$stop_lat),color="green") %>%
addCircleMarkers(as.numeric(GreenELineRoute1$stop_lon),as.numeric(GreenELineRoute1$stop_lat),radius=1,popup = GreenELineRoute1$parent_station_name) %>%
addPolylines(as.numeric(BlueLineRoute1$stop_lon),as.numeric(BlueLineRoute1$stop_lat),color="blue") %>%
addCircleMarkers(as.numeric(BlueLineRoute1$stop_lon),as.numeric(BlueLineRoute1$stop_lat),radius=1,popup = BlueLineRoute1$parent_station_name) %>%
addPolylines(as.numeric(OrangeLineRoute1$stop_lon),as.numeric(OrangeLineRoute1$stop_lat),color="orange") %>%
addCircleMarkers(as.numeric(OrangeLineRoute1$stop_lon),as.numeric(OrangeLineRoute1$stop_lat),radius=1,popup = OrangeLineRoute1$parent_station_name) %>%
addPolylines(as.numeric(MattapanLineRoute1$stop_lon),as.numeric(MattapanLineRoute1$stop_lat),color="red") %>%
addCircleMarkers(as.numeric(MattapanLineRoute1$stop_lon),as.numeric(MattapanLineRoute1$stop_lat),radius=1,popup = MattapanLineRoute1$parent_station_name)
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated some of the plots.